{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.16666667, 1.        , 0.25      , 0.25      , 0.125     ])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def compute_reciprocals(values):\n",
    "    output = np.empty(len(values))\n",
    "    for i in range(len(values)):\n",
    "        output[i] = 1.0 / values[i]\n",
    "    return output\n",
    "        \n",
    "values = np.random.randint(1, 10, size=5)\n",
    "compute_reciprocals(values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "184 ms ± 4.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "big_array = np.random.randint(1, 100, size=100000)\n",
    "%timeit compute_reciprocals(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "116 µs ± 782 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit (1.0 / big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ufun_reci(x):\n",
    "    return 1/x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "fun_reci=np.frompyfunc(ufun_reci,1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.9 ms ± 84.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit fun_reci(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import vectorize, float64,int32\n",
    "\n",
    "@vectorize([float64( int32)])\n",
    "def ufun_reci(x):\n",
    "    return 1/x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "big_array = np.random.randint(1, 100, size=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.45 µs ± 93.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit ufun_reci(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import jit,njit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "@jit\n",
    "def ufun_reci(x):\n",
    "    return 1/x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.4 µs ± 82.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit ufun_reci(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "@njit\n",
    "def ufun_reci(x):\n",
    "    return 1/x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.21 µs ± 36.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit ufun_reci(big_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import cuda, float32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def reci_kernel(x, out):\n",
    "    tx = cuda.threadIdx.x \n",
    "    ty = cuda.blockIdx.x  \n",
    "\n",
    "    block_size = cuda.blockDim.x  \n",
    "    grid_size = cuda.gridDim.x    \n",
    "    \n",
    "    start = tx + ty * block_size\n",
    "    stride = block_size * grid_size\n",
    "    for i in range(start, x.shape[0], stride):\n",
    "        out[i] = 1/x[i]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "threads_per_block = 128\n",
    "blocks_per_grid = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "big_array = np.random.randint(1, 100, size=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "out=np.random.ranf(10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.15 ms ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit reci_kernel[blocks_per_grid, threads_per_block](big_array,  out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fri Apr 29 09:34:08 2022       \n",
      "+-----------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 411.31                 Driver Version: 411.31                    |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
      "|===============================+======================+======================|\n",
      "|   0  GeForce GTX 1050   WDDM  | 00000000:01:00.0  On |                  N/A |\n",
      "|  0%   43C    P0    N/A /  70W |    237MiB /  2048MiB |      5%      Default |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "                                                                               \n",
      "+-----------------------------------------------------------------------------+\n",
      "| Processes:                                                       GPU Memory |\n",
      "|  GPU       PID   Type   Process name                             Usage      |\n",
      "|=============================================================================|\n",
      "|    0      1244    C+G   ...es\\Google\\Chrome\\Application\\chrome.exe N/A      |\n",
      "|    0      4084      C   D:\\ProgramData\\Anaconda3\\python.exe        N/A      |\n",
      "+-----------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.01492537, 0.01149425, 0.03846154, ..., 0.01886792, 0.05263158,\n",
       "       0.07142857])"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "TPB = 16\n",
    "\n",
    "@cuda.jit\n",
    "def fast_matmul(A, B, C):\n",
    "    # Define an array in the shared memory\n",
    "    # The size and type of the arrays must be known at compile time\n",
    "    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)\n",
    "    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)\n",
    "\n",
    "    x, y = cuda.grid(2)\n",
    "\n",
    "    tx = cuda.threadIdx.x\n",
    "    ty = cuda.threadIdx.y\n",
    "    bpg = cuda.gridDim.x    # blocks per grid\n",
    "\n",
    "    if x >= C.shape[0] and y >= C.shape[1]:\n",
    "        # Quit if (x, y) is outside of valid C boundary\n",
    "        return\n",
    "\n",
    "    # Each thread computes one element in the result matrix.\n",
    "    # The dot product is chunked into dot products of TPB-long vectors.\n",
    "    tmp = 0.\n",
    "    for i in range(bpg):\n",
    "        # Preload data into shared memory\n",
    "        sA[tx, ty] = A[x, ty + i * TPB]\n",
    "        sB[tx, ty] = B[tx + i * TPB, y]\n",
    "\n",
    "        # Wait until all threads finish preloading\n",
    "        cuda.syncthreads()\n",
    "\n",
    "        # Computes partial product on the shared memory\n",
    "        for j in range(TPB):\n",
    "            tmp += sA[tx, j] * sB[j, ty]\n",
    "\n",
    "        # Wait until all threads finish computing\n",
    "        cuda.syncthreads()\n",
    "\n",
    "    C[x, y] = tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "A=np.random.ranf(256)\n",
    "B=np.random.ranf(256)\n",
    "C=np.random.ranf(256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "threads_per_block = 16\n",
    "blocks_per_grid = 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypingError",
     "evalue": "Failed in nopython mode pipeline (step: nopython frontend)\nInternal error at <numba.core.typeinfer.StaticGetItemConstraint object at 0x000000000EA231F0>.\n\u001b[1m\u001b[1mtuple index out of range\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of static-get-item at <ipython-input-93-7187a13fdd21> (16)\u001b[0m\nEnable logging at debug level for details.\n\u001b[1m\nFile \"<ipython-input-93-7187a13fdd21>\", line 16:\u001b[0m\n\u001b[1mdef fast_matmul(A, B, C):\n    <source elided>\n\n\u001b[1m    if x >= C.shape[0] and y >= C.shape[1]:\n\u001b[0m    \u001b[1m^\u001b[0m\u001b[0m\n",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypingError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-109-867add7cbb15>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mget_ipython\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'timeit'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'fast_matmul[blocks_per_grid, threads_per_block](A,B,C)'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py\u001b[0m in \u001b[0;36mrun_line_magic\u001b[1;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[0;32m   2325\u001b[0m                 \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'local_ns'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_local_scope\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstack_depth\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2326\u001b[0m             \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2327\u001b[1;33m                 \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   2328\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   2329\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<decorator-gen-54>\u001b[0m in \u001b[0;36mtimeit\u001b[1;34m(self, line, cell, local_ns)\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\magic.py\u001b[0m in \u001b[0;36m<lambda>\u001b[1;34m(f, *a, **k)\u001b[0m\n\u001b[0;32m    185\u001b[0m     \u001b[1;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    186\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 187\u001b[1;33m         \u001b[0mcall\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mk\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    188\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    189\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\magics\\execution.py\u001b[0m in \u001b[0;36mtimeit\u001b[1;34m(self, line, cell, local_ns)\u001b[0m\n\u001b[0;32m   1167\u001b[0m             \u001b[1;32mfor\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1168\u001b[0m                 \u001b[0mnumber\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m10\u001b[0m \u001b[1;33m**\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1169\u001b[1;33m                 \u001b[0mtime_number\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtimer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimeit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnumber\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1170\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mtime_number\u001b[0m \u001b[1;33m>=\u001b[0m \u001b[1;36m0.2\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1171\u001b[0m                     \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\IPython\\core\\magics\\execution.py\u001b[0m in \u001b[0;36mtimeit\u001b[1;34m(self, number)\u001b[0m\n\u001b[0;32m    167\u001b[0m         \u001b[0mgc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdisable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    168\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 169\u001b[1;33m             \u001b[0mtiming\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minner\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mit\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtimer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    170\u001b[0m         \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    171\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mgcold\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<magic-timeit>\u001b[0m in \u001b[0;36minner\u001b[1;34m(_it, _timer)\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\compiler.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m    767\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    768\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 769\u001b[1;33m         return self.dispatcher.call(args, self.griddim, self.blockdim,\n\u001b[0m\u001b[0;32m    770\u001b[0m                                     self.stream, self.sharedmem)\n\u001b[0;32m    771\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\compiler.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, args, griddim, blockdim, stream, sharedmem)\u001b[0m\n\u001b[0;32m    859\u001b[0m         argtypes = tuple(\n\u001b[0;32m    860\u001b[0m             [self.typingctx.resolve_argument_type(a) for a in args])\n\u001b[1;32m--> 861\u001b[1;33m         \u001b[0mkernel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    862\u001b[0m         \u001b[0mkernel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlaunch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgriddim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mblockdim\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msharedmem\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    863\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\compiler.py\u001b[0m in \u001b[0;36mcompile\u001b[1;34m(self, sig)\u001b[0m\n\u001b[0;32m    928\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_can_compile\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    929\u001b[0m                 \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Compilation disabled\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 930\u001b[1;33m             kernel = compile_kernel(self.py_func, argtypes,\n\u001b[0m\u001b[0;32m    931\u001b[0m                                     \u001b[0mlink\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlink\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    932\u001b[0m                                     **self.targetoptions)\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     30\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     31\u001b[0m             \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 32\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     33\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\compiler.py\u001b[0m in \u001b[0;36mcompile_kernel\u001b[1;34m(pyfunc, args, link, debug, inline, fastmath, extensions, max_registers, opt)\u001b[0m\n\u001b[0;32m     55\u001b[0m def compile_kernel(pyfunc, args, link, debug=False, inline=False,\n\u001b[0;32m     56\u001b[0m                    fastmath=False, extensions=[], max_registers=None, opt=True):\n\u001b[1;32m---> 57\u001b[1;33m     \u001b[0mcres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompile_cuda\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpyfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvoid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdebug\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdebug\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minline\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0minline\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     58\u001b[0m     \u001b[0mfname\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfndesc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mllvm_func_name\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     59\u001b[0m     lib, kernel = cres.target_context.prepare_cuda_kernel(cres.library, fname,\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     30\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     31\u001b[0m             \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 32\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     33\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\compiler.py\u001b[0m in \u001b[0;36mcompile_cuda\u001b[1;34m(pyfunc, return_type, args, debug, inline)\u001b[0m\n\u001b[0;32m     38\u001b[0m         \u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'forceinline'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m     \u001b[1;31m# Run compilation pipeline\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 40\u001b[1;33m     cres = compiler.compile_extra(typingctx=typingctx,\n\u001b[0m\u001b[0;32m     41\u001b[0m                                   \u001b[0mtargetctx\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtargetctx\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     42\u001b[0m                                   \u001b[0mfunc\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mpyfunc\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[1;34m(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)\u001b[0m\n\u001b[0;32m    625\u001b[0m     pipeline = pipeline_class(typingctx, targetctx, library,\n\u001b[0;32m    626\u001b[0m                               args, return_type, flags, locals)\n\u001b[1;32m--> 627\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mpipeline\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompile_extra\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    628\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    629\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler.py\u001b[0m in \u001b[0;36mcompile_extra\u001b[1;34m(self, func)\u001b[0m\n\u001b[0;32m    361\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlifted\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    362\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlifted_from\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 363\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_compile_bytecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    364\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    365\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mcompile_ir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc_ir\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlifted\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlifted_from\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler.py\u001b[0m in \u001b[0;36m_compile_bytecode\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    423\u001b[0m         \"\"\"\n\u001b[0;32m    424\u001b[0m         \u001b[1;32massert\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc_ir\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 425\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_compile_core\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    426\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    427\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_compile_ir\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    403\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstatus\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfail_reason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    404\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mis_final_pipeline\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m                     \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    406\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    407\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mCompilerError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"All available pipelines exhausted\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler.py\u001b[0m in \u001b[0;36m_compile_core\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    394\u001b[0m             \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    395\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 396\u001b[1;33m                 \u001b[0mpm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    397\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcr\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    398\u001b[0m                     \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, state)\u001b[0m\n\u001b[0;32m    339\u001b[0m                     \u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpipeline_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpass_desc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    340\u001b[0m                 \u001b[0mpatched_exception\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_patch_error\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 341\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mpatched_exception\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    342\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    343\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mdependency_analysis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_machinery.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, state)\u001b[0m\n\u001b[0;32m    330\u001b[0m                 \u001b[0mpass_inst\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_pass_registry\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    331\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpass_inst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mCompilerPass\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 332\u001b[1;33m                     \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_runPass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0midx\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpass_inst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    333\u001b[0m                 \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    334\u001b[0m                     \u001b[1;32mraise\u001b[0m \u001b[0mBaseException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Legacy pass in use\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_lock.py\u001b[0m in \u001b[0;36m_acquire_compile_lock\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     30\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     31\u001b[0m             \u001b[1;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 32\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     33\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0m_acquire_compile_lock\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_machinery.py\u001b[0m in \u001b[0;36m_runPass\u001b[1;34m(self, index, pss, internal_state)\u001b[0m\n\u001b[0;32m    289\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[1;33m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_initialization\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    290\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpass_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 291\u001b[1;33m             \u001b[0mmutated\u001b[0m \u001b[1;33m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_pass\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    292\u001b[0m         \u001b[1;32mwith\u001b[0m \u001b[0mSimpleTimer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfinalize_time\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    293\u001b[0m             \u001b[0mmutated\u001b[0m \u001b[1;33m|=\u001b[0m \u001b[0mcheck\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpss\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_finalizer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minternal_state\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\compiler_machinery.py\u001b[0m in \u001b[0;36mcheck\u001b[1;34m(func, compiler_state)\u001b[0m\n\u001b[0;32m    262\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    263\u001b[0m         \u001b[1;32mdef\u001b[0m \u001b[0mcheck\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcompiler_state\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 264\u001b[1;33m             \u001b[0mmangled\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcompiler_state\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    265\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mmangled\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32min\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    266\u001b[0m                 msg = (\"CompilerPass implementations should return True/False. \"\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\typed_passes.py\u001b[0m in \u001b[0;36mrun_pass\u001b[1;34m(self, state)\u001b[0m\n\u001b[0;32m     90\u001b[0m                               % (state.func_id.func_name,)):\n\u001b[0;32m     91\u001b[0m             \u001b[1;31m# Type inference\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 92\u001b[1;33m             typemap, return_type, calltypes = type_inference_stage(\n\u001b[0m\u001b[0;32m     93\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtypingctx\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     94\u001b[0m                 \u001b[0mstate\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc_ir\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\typed_passes.py\u001b[0m in \u001b[0;36mtype_inference_stage\u001b[1;34m(typingctx, interp, args, return_type, locals, raise_errors)\u001b[0m\n\u001b[0;32m     68\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     69\u001b[0m         \u001b[0minfer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbuild_constraint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 70\u001b[1;33m         \u001b[0minfer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpropagate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     71\u001b[0m         \u001b[0mtypemap\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrestype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcalltypes\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minfer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munify\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mraise_errors\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     72\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\core\\typeinfer.py\u001b[0m in \u001b[0;36mpropagate\u001b[1;34m(self, raise_errors)\u001b[0m\n\u001b[0;32m   1069\u001b[0m                                   if isinstance(e, ForceLiteralArg)]\n\u001b[0;32m   1070\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1071\u001b[1;33m                     \u001b[1;32mraise\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1072\u001b[0m                 \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1073\u001b[0m                     \u001b[1;32mraise\u001b[0m \u001b[0mreduce\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0moperator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mor_\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mforce_lit_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mTypingError\u001b[0m: Failed in nopython mode pipeline (step: nopython frontend)\nInternal error at <numba.core.typeinfer.StaticGetItemConstraint object at 0x000000000EA231F0>.\n\u001b[1m\u001b[1mtuple index out of range\u001b[0m\n\u001b[0m\u001b[1mDuring: typing of static-get-item at <ipython-input-93-7187a13fdd21> (16)\u001b[0m\nEnable logging at debug level for details.\n\u001b[1m\nFile \"<ipython-input-93-7187a13fdd21>\", line 16:\u001b[0m\n\u001b[1mdef fast_matmul(A, B, C):\n    <source elided>\n\n\u001b[1m    if x >= C.shape[0] and y >= C.shape[1]:\n\u001b[0m    \u001b[1m^\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "%timeit fast_matmul[blocks_per_grid, threads_per_block](A,B,C)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "ename": "NotImplementedError",
     "evalue": "<class 'numba.cuda.stubs.grid'> is not instantiable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNotImplementedError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-111-cc677ab6aeef>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcuda\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgrid\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\numba\\cuda\\stubs.py\u001b[0m in \u001b[0;36m__new__\u001b[1;34m(cls)\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     20\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__new__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 21\u001b[1;33m         \u001b[1;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"%s is not instantiable\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     22\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     23\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__repr__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNotImplementedError\u001b[0m: <class 'numba.cuda.stubs.grid'> is not instantiable"
     ]
    }
   ],
   "source": [
    "cuda.grid()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
